**Project Title: Income and Employment for the Various Counties in the USA**

#census_api_key(Sys.getenv("CENSUS_API_KEY"), install = TRUE)
#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

#install.packages("factoextra")
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)

DF1 - EMPLOYMENT

#Abhay
#Employment - K202301 for 2021

# Get ACS data: employment table K202301 for 2021.
# K202301 is an ACS Supplemental Estimates (ACSSE) table; tidycensus detects
# the K-prefix and routes the request to ACSSE automatically.
# FIX: "acs1-year" is not a documented `survey` value -- the documented
# 1-year value is "acs1" (see ?get_acs).
df1 <- get_acs(geography = "state",
               table = "K202301",
               year = 2021,
               survey = "acs1",
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202301 and caching the dataset for faster future access.
# Lookup table mapping K202301 variable codes to human-readable labels.
variables_df1 <- data.frame(
  variable = c("K202301_001", "K202301_002", "K202301_003", "K202301_004", "K202301_005", "K202301_006", "K202301_007"),
  label = c("Total", "In labor force:", "Civilian labor force:", "Employed", 
            "Unemployed", "In Armed Forces", 
            "Not in labor force")
)
# Join with variable labels


# Now df_labeled will have an additional column with the variable labels
# Load variables for ACS 2022
# Sample mapping of variable codes to labels
# Replace this with the actual codes and labels


# Assuming df1 is your data frame with the estimates you've loaded
# Make sure to replace 'NAME' and 'estimate' with the actual column names from your df1

# Attach the human-readable labels to the raw estimates, then widen so each
# state is one row and each labelled variable is one column.
df1_labeled <- left_join(df1, variables_df1, by = "variable")

df1_wide <- pivot_wider(df1_labeled,
                        id_cols = NAME,
                        names_from = label,
                        values_from = estimate)


# df_wide now has states as columns and the descriptive variable labels as rows
# # Calculate employment rates
# df1_wide <- df1_wide %>%
#   mutate(
#     EmploymentRate = `In labor force: Civilian labor force: Employed` / `Total` * 100,
#     UnemploymentRate = `In labor force: Civilian labor force: Unemployed` / `Total` * 100,
#     NotInLaborForceRate = `Not in labor force` / `Total` * 100
#   )
# 
# # Plotting employment rates
# library(ggplot2)
# 
# ggplot(df1_wide, aes(x = reorder(NAME, -EmploymentRate), y = EmploymentRate)) +
#   geom_bar(stat = "identity", fill = "steelblue") +
#   labs(x = "State", y = "Employment Rate (%)", title = "Employment Rates by State") +
#   theme_minimal() +
#   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# # Plotting labor force participation
# ggplot(df1_wide, aes(x = NAME)) +
#   geom_bar(aes(y = `In labor force: Civilian labor force: Employed`, fill = "Employed"), stat = "identity") +
#   geom_bar(aes(y = `In labor force: Civilian labor force: Unemployed`, fill = "Unemployed"), stat = "identity") +
#   geom_bar(aes(y = `In labor force: Armed Forces`, fill = "Armed Forces"), stat = "identity") +
#   scale_fill_manual(values = c("Employed" = "green", "Unemployed" = "red", "Armed Forces" = "blue")) +
#   theme(axis.text.x = element_text(angle = 90)) +
#   labs(x = "State", y = "Number of People", fill = "Category", title = "Labor Force Participation by State")
# ggplot(df1_wide, aes(x = UnemploymentRate)) +
#   geom_histogram(binwidth = 1, fill = "tomato", color = "black") +
#   labs(x = "Unemployment Rate (%)", y = "Number of States", title = "Distribution of Unemployment Rates across States")
# # Correlation analysis
# employment_data <- df1_wide %>%
#   select(`In labor force: Civilian labor force: Employed`, `In labor force: Civilian labor force: Unemployed`, `In labor force: Armed Forces`, `Not in labor force`)
# 
# correlation_matrix <- cor(employment_data, use = "complete.obs")
# 
# # Visualize the correlation matrix
# library(corrplot)
# corrplot(correlation_matrix, method = "circle")
# # Calculate employment rate
# # Assuming df1_wide or df6_wide has a column 'state' with state names or abbreviations
# # Calculate the metric you want to plot (for example, Employment Rate)
# 
# 
# # Assuming df1_wide has full state names in the 'NAME' column
# # Convert state names to abbreviations
# state_abbreviations <- state.abb[match(df1_wide$NAME, state.name)]
# df1_wide$state <- state_abbreviations
# 
# # Ensure you have calculated the metric you want to plot (e.g., Employment Rate)
# df1_wide$EmploymentRate <- df1_wide$`In labor force: Civilian labor force: Employed` / df1_wide$Total * 100
# 
# # Plot the map with employment rate
# plot_usmap(data = df1_wide, values = "EmploymentRate", labels = TRUE) +
#   scale_fill_continuous(name = "Employment Rate (%)", label = scales::percent_format()) +
#   theme(legend.position = "right") +
#   labs(title = "Employment Rate by State in 2021")


DF6 - Disabilities

#Abhay
#Disabilities - K201803
# Get ACS data for the Disabilities dataset (table K201803, 2021).
# FIX: "acs1-year" is not a documented `survey` value; the documented 1-year
# value is "acs1". The K-prefix table is routed to ACSSE automatically.
df6 <- get_acs(geography = "state",
               table = "K201803",
               year = 2021,
               survey = "acs1",
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201803 and caching the dataset for faster future access.
# Lookup table mapping K201803 variable codes to human-readable labels.
# NOTE(review): "cognative" is a misspelling of "Cognitive difficulty", but it
# is deliberately kept as-is because downstream code (select()/lm() calls
# later in this file) references the resulting column by this exact name.
variables_df6 <- data.frame(
  variable = c("K201803_001", "K201803_002", "K201803_003", "K201803_004", "K201803_005", "K201803_006", "K201803_007","K201803_008","K201803_009"),
  label = c("Total_people", "Total With Disabilities", 
            "Hearing", "Vision difficulty", 
            "cognative", "ambulatory difficulty", 
            "Self-care difficulty","Independent living difficulty","No Disability")
)

# Attach the descriptive labels, then widen: one row per state, one column
# per disability category.
df6_labeled <- left_join(df6, variables_df6, by = "variable")

df6_wide <- pivot_wider(df6_labeled,
                        id_cols = NAME,
                        names_from = label,
                        values_from = estimate)
df6_wide
## # A tibble: 52 × 10
##    NAME          Total_people Total With Disabilit…¹ Hearing `Vision difficulty`
##    <chr>                <dbl>                  <dbl>   <dbl>               <dbl>
##  1 Alabama            4957633                 808071  208028              152798
##  2 Alaska              702154                  92390   33397               15748
##  3 Arizona            7174053                 972252  298849              180792
##  4 Arkansas           2974701                 517051  142133              105624
##  5 California        38724294                4324355 1140131              844049
##  6 Colorado           5715497                 640346  211803              120570
##  7 Connecticut        3557526                 427014  113490               78078
##  8 Delaware            987964                 130551   37933               25335
##  9 District of …       659979                  76754   14429               14569
## 10 Florida           21465883                2906367  812248              555361
## # ℹ 42 more rows
## # ℹ abbreviated name: ¹​`Total With Disabilities`
## # ℹ 5 more variables: cognative <dbl>, `ambulatory difficulty` <dbl>,
## #   `Self-care difficulty` <dbl>, `Independent living difficulty` <dbl>,
## #   `No Disability` <dbl>
# df6_wide now has states as rows and the descriptive variable labels as columns
# Share of each state's population reporting at least one disability.
df6_wide <- df6_wide %>%
  mutate(DisabilityRate = `Total With Disabilities` / `Total_people` * 100)

# Plotting with ggplot2 (already attached above; reloading is a no-op)
library(ggplot2)

# Bar chart of disability rate by state, sorted from highest to lowest.
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
ggplot(df6_wide, aes(x = reorder(NAME, -DisabilityRate), y = DisabilityRate)) +
  geom_col(fill = "steelblue") +
  labs(x = "State", y = "Disability Rate (%)", title = "Disability Rates by State") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Select only the disability-count columns for correlation analysis.
disability_data <- df6_wide %>%
  select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)

# Pairwise Pearson correlations; "complete.obs" drops rows with missing values.
correlation_matrix <- cor(disability_data, use = "complete.obs")

# Install corrplot if absent.
# FIX: requireNamespace() is preferred over require() for availability checks:
# require() attaches the package as a side effect and merely returns FALSE on
# failure, which makes a broken dependency easy to miss.
if (!requireNamespace("corrplot", quietly = TRUE)) install.packages("corrplot")
library(corrplot)
# Visualize the correlation matrix as sized/shaded circles.
corrplot(correlation_matrix, method = "circle")

# Comparative analysis of disabilities within a single state (example: Iowa).
disability_cols <- c("Hearing", "Vision difficulty", "cognative",
                     "ambulatory difficulty", "Self-care difficulty",
                     "Independent living difficulty")
iowa_data <- df6_wide %>%
  filter(NAME == "Iowa") %>%
  select(all_of(disability_cols))

# Grouped bar chart comparing Iowa's counts across disability categories.
barplot(as.matrix(iowa_data), beside = TRUE, legend.text = TRUE,
        args.legend = list(x = "topright"))

# Predicting 'Ambulatory difficulty' based on other disabilities
# Linear model: state-level ambulatory-difficulty counts regressed on the
# other disability-type counts (one observation per state, from df6_wide).
lm_model <- lm(`ambulatory difficulty` ~ `Hearing` + `Vision difficulty` + `cognative` + `Self-care difficulty` + `Independent living difficulty`, data = df6_wide)

# Summary of the linear model
# NOTE(review): the very high R-squared (~0.997 in the recorded output below)
# likely reflects that all predictors scale with state population, not
# necessarily a strong relationship between the disability types themselves.
summary(lm_model)
## 
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` + 
##     cognative + `Self-care difficulty` + `Independent living difficulty`, 
##     data = df6_wide)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -56621 -11941  -2766  17524  73868 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -8200.5856  5646.4977  -1.452 0.153197    
## Hearing                             0.6796     0.1793   3.791 0.000436 ***
## `Vision difficulty`                 1.0070     0.1644   6.126 1.87e-07 ***
## cognative                          -0.5426     0.2291  -2.369 0.022116 *  
## `Self-care difficulty`             -1.9464     0.4434  -4.390 6.59e-05 ***
## `Independent living difficulty`     1.9228     0.3144   6.115 1.95e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24940 on 46 degrees of freedom
## Multiple R-squared:  0.9968, Adjusted R-squared:  0.9965 
## F-statistic:  2909 on 5 and 46 DF,  p-value: < 2.2e-16
# Print the fitted model again (compact coefficient listing only).
lm_model
## 
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` + 
##     cognative + `Self-care difficulty` + `Independent living difficulty`, 
##     data = df6_wide)
## 
## Coefficients:
##                     (Intercept)                          Hearing  
##                      -8200.5856                           0.6796  
##             `Vision difficulty`                        cognative  
##                          1.0070                          -0.5426  
##          `Self-care difficulty`  `Independent living difficulty`  
##                         -1.9464                           1.9228
# K-means clustering
# Group states into three clusters with similar disability profiles.
# NOTE(review): the counts are unscaled, so clustering largely tracks state
# population size -- flagged here, behavior kept as-is.


# K-means clustering with 3 centers
set.seed(123)  # fixed seed so the cluster assignment is reproducible
km_fit <- kmeans(disability_data, centers = 3)

# Attach each state's cluster label to the wide table.
df6_wide$Cluster <- as.factor(km_fit$cluster)

# Project onto the first two principal components and draw a biplot
# colored by cluster, with confidence ellipses per cluster.
pca_fit <- prcomp(disability_data, scale. = TRUE)
fviz_pca_biplot(pca_fit, label = "none", col.ind = df6_wide$Cluster, addEllipses = TRUE)



DF2 - Education

#Neha
#Education - K201501
# FIX: K201501 is an ACSSE (K-prefix) table, not a subject (S-prefix) table,
# so "acs1/subject" mislabels it; the documented survey value is "acs1" and
# tidycensus routes K-tables to the Supplemental Estimates automatically.
df2 <- get_acs(geography = "state", 
                table = "K201501", 
                year = 2021, 
                survey = "acs1", cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201501 and caching the dataset for faster future access.
# Lookup table mapping K201501 variable codes to educational-attainment labels.
variables_df2 <- data.frame(
  variable = c("K201501_001", "K201501_002", "K201501_003", "K201501_004", "K201501_005", "K201501_006", "K201501_007", "K201501_008"),
  label = c("Education_Total_students", "Education_Below_9th grade", "Education_9th to 12th grade_no diploma", "Education_High_school_graduate", "Education_Some college_no degree", "Education_Associate's_degree", "Education_Bachelor's_degree", "Education_Graduate_professional degree")
)

# Attach labels, then widen: one row per state, one column per level.
df2_labeled <- df2 %>%
  left_join(variables_df2, by = "variable")

df2_wide <- df2_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)


DF3 - Citizenship

#Esha
#Citizenship - K200501
# FIX: K200501 is an ACSSE (K-prefix) table; the documented survey value is
# "acs1" (not "acs1/subject") and tidycensus routes K-tables to ACSSE itself.
df3 <- get_acs(
  geography = "state", 
  table = "K200501", 
  year = 2021, 
  survey = "acs1", 
  cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200501 and caching the dataset for faster future access.
# Lookup table mapping K200501 variable codes to citizenship labels.
variables_df3 <- data.frame(
  variable = c("K200501_001", "K200501_002","K200501_003"),  
  label = c("Total","U.S. citizen", "Not a U.S. citizen")  
)

# Attach labels, then widen: one row per state.
df3_labeled <- df3 %>%
  left_join(variables_df3, by = "variable")

df3_wide <- df3_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)

***


DF4 - Age

#Srika
#Age - K200104
# FIX: K200104 is an ACSSE (K-prefix) table; the documented survey value is
# "acs1" (not "acs1/subject") and tidycensus routes K-tables to ACSSE itself.
df4 <- get_acs(geography = "state", 
                table = "K200104", 
                year = 2021,
                survey = "acs1", cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
# Lookup table mapping K200104 variable codes to age-bracket labels.
variables_df4 <- data.frame(
  variable = c("K200104_001", "K200104_002", "K200104_003", "K200104_004", "K200104_005", "K200104_006", "K200104_007", "K200104_008"),

  label = c("Total_age", "Age_under_18", 
            "Age_18_to_24", "Age_25_to_34", 
            "Age_35_to_44", "Age_45_to_54", "Age_55_to_64","Age_over_64")
)

# Attach labels, then widen: one row per state, one column per age bracket.
df4_labeled <- df4 %>%
  left_join(variables_df4, by = "variable")

df4_wide <- df4_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::map()    masks maps::map()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
# Re-pull the age table with simple-feature geometry attached for mapping.
# FIX: the documented survey value is "acs1" (not "acs1/subject"); the
# K-prefix table is routed to ACSSE automatically by tidycensus.
Age_data <- get_acs(geography = "state", 
                table = "K200104", 
                year = 2021, 
               geometry = TRUE,
               resolution = "20m",
                survey = "acs1", cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
## (knitr shapefile-download progress bar, 0% to 100%, omitted from this transcript)
# K200104_007 is the "Age 55 to 64" count (see variables_df4 above), and
# shift_geometry() repositions Alaska/Hawaii/PR for a compact national map.
Mapping_age <- Age_data %>%
  filter(variable == "K200104_007") %>%
  shift_geometry()
# Choropleth of the 55-64 population by state.
# FIX: the original title said "Median Age by State", which mislabeled the
# mapped variable -- the plotted estimate is a population count, not an age.
ggplot(data = Mapping_age, aes(fill = estimate)) + 
  geom_sf() +
  scale_fill_distiller(palette = "RdPu", 
                       direction = 1) + 
  labs(title = "Population Aged 55 to 64 by State, 2021",
       caption = "Data source: 2021 1-year ACS, US Census Bureau",
       fill = "ACS estimate") + 
  theme_void()



DF5 - Housing

#Niharika
# Housing - K202502 (occupancy: total / owner occupied / renter occupied)
df5 <- get_acs(geography = "state",
               table = "K202502",
               year = 2021,
               survey = "acs1", # "acs1" (no /subject) -- specific table ID given
               cache_table = TRUE)
## Getting data from the 2021 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202502 and caching the dataset for faster future access.
# Lookup table mapping K202502 variable codes to housing-tenure labels.
variables_df5 <- data.frame(
  variable = c("K202502_001", "K202502_002", "K202502_003"),
  label = c("Total", "Owner Occupied", "Renter Occupied")
)

# Attach the labels to the raw estimates.
df5_labeled <- left_join(df5, variables_df5, by = "variable")

# Widen: one row per state, one column per tenure category.
df5_wide <- pivot_wider(df5_labeled,
                        id_cols = NAME,
                        names_from = label,
                        values_from = estimate)

# df5_wide now has states as rows and the housing-tenure variables as columns.

# df5_wide will have states as rows and different housing-related variables as columns

Plan for Data Exploration

# Merge all six wide tables on state name into one analysis dataset.
merged_data <- list(df1_wide, df2_wide, df3_wide,
                    df4_wide, df5_wide, df6_wide) %>%
  reduce(left_join, by = "NAME")

# Drop the suffixed duplicate "Total" columns created by the joins; names not
# present are skipped silently, matching the original %in% subsetting.
columns_to_drop <- c('Total.x', 'Total.y', 'Total.x.x', 'Total.y.y')
merged_data <- select(merged_data, -any_of(columns_to_drop))

colnames(merged_data)
##  [1] "NAME"                                  
##  [2] "In labor force:"                       
##  [3] "Civilian labor force:"                 
##  [4] "Employed"                              
##  [5] "Unemployed"                            
##  [6] "In Armed Forces"                       
##  [7] "Not in labor force"                    
##  [8] "Education_Total_students"              
##  [9] "Education_Below_9th grade"             
## [10] "Education_9th to 12th grade_no diploma"
## [11] "Education_High_school_graduate"        
## [12] "Education_Some college_no degree"      
## [13] "Education_Associate's_degree"          
## [14] "Education_Bachelor's_degree"           
## [15] "Education_Graduate_professional degree"
## [16] "U.S. citizen"                          
## [17] "Not a U.S. citizen"                    
## [18] "Total_age"                             
## [19] "Age_under_18"                          
## [20] "Age_18_to_24"                          
## [21] "Age_25_to_34"                          
## [22] "Age_35_to_44"                          
## [23] "Age_45_to_54"                          
## [24] "Age_55_to_64"                          
## [25] "Age_over_64"                           
## [26] "Total"                                 
## [27] "Owner Occupied"                        
## [28] "Renter Occupied"                       
## [29] "Total_people"                          
## [30] "Total With Disabilities"               
## [31] "Hearing"                               
## [32] "Vision difficulty"                     
## [33] "cognative"                             
## [34] "ambulatory difficulty"                 
## [35] "Self-care difficulty"                  
## [36] "Independent living difficulty"         
## [37] "No Disability"                         
## [38] "DisabilityRate"                        
## [39] "Cluster"

************************

Title

“Socio-Economic Factors Influencing Employment in the United States: A Comprehensive State-by-State Analysis”

Thesis

This project aims to analyze the impact of various socio-economic factors such as education, age, citizenship status, housing, and disabilities on employment rates across different states in the United States. Using data from the American Community Survey (ACS) 2021, the study will employ statistical techniques to identify correlations, trends, and patterns, thereby providing insights into the multifaceted nature of employment dynamics in the US.

Roadmap and Code for Next Steps

  1. Data Exploration and Cleaning

    • Continue with data cleaning and handling missing values.

    • Normalize the data if required for better comparison.

# Checking for missing values across the merged dataset.
sum(is.na(merged_data))
## [1] 2
# Impute missing values in numeric columns with the column median.
# FIX: across(where(is.numeric)) replaces the superseded mutate_at(), and it
# restricts imputation to numeric columns -- the old vars(-group_cols()) form
# would have called median() on character columns such as NAME if they ever
# contained an NA, which errors.
merged_data <- merged_data %>% 
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

# Min-max scale a numeric vector onto [0, 1].
# NOTE(review): returns NaN for a constant vector (max == min) and propagates
# any remaining NA -- both unchanged from the original behavior.
normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

# Scaled copy of the numeric columns only (NAME and other non-numerics dropped).
merged_data_normalized <- as.data.frame(lapply(merged_data[, sapply(merged_data, is.numeric)], normalize))
  1. Exploratory Data Analysis (EDA)

    • Generate summary statistics and visualizations for each dataset to understand distributions and detect outliers.

    • Explore the relationships between different variables using scatter plots, box plots, etc.

# Summary Statistics
# Five-number summary plus mean for every numeric column -- a quick scan for
# ranges, skew, and outliers before modeling.
summary(merged_data)
##      NAME           In labor force:    Civilian labor force:    Employed       
##  Length:52          Min.   :  301303   Min.   :  298603      Min.   :  287432  
##  Class :character   1st Qu.:  897042   1st Qu.:  892926      1st Qu.:  845392  
##  Mode  :character   Median : 2134618   Median : 2113270      Median : 1975060  
##                     Mean   : 3259412   Mean   : 3233640      Mean   : 3028193  
##                     3rd Qu.: 3897332   3rd Qu.: 3877097      3rd Qu.: 3627645  
##                     Max.   :19961610   Max.   :19805371      Max.   :18156051  
##    Unemployed      In Armed Forces  Not in labor force Education_Total_students
##  Min.   :  11171   Min.   :  1313   Min.   :  160319   Min.   :  395348        
##  1st Qu.:  47280   1st Qu.:  4381   1st Qu.:  531030   1st Qu.: 1263471        
##  Median : 132957   Median : 12108   Median : 1463585   Median : 3054251        
##  Mean   : 205447   Mean   : 25771   Mean   : 1930062   Mean   : 4434517        
##  3rd Qu.: 216516   3rd Qu.: 25365   3rd Qu.: 2270377   3rd Qu.: 5085510        
##  Max.   :1649320   Max.   :156239   Max.   :11545627   Max.   :26909869        
##  Education_Below_9th grade Education_9th to 12th grade_no diploma
##  Min.   :   7224           Min.   :  17461                       
##  1st Qu.:  40727           1st Qu.:  65815                       
##  Median : 108208           Median : 173250                       
##  Mean   : 214804           Mean   : 261375                       
##  3rd Qu.: 212026           3rd Qu.: 294885                       
##  Max.   :2380657           Max.   :1804222                       
##  Education_High_school_graduate Education_Some college_no degree
##  Min.   :  70925                Min.   :  56221                 
##  1st Qu.: 335012                1st Qu.: 263017                 
##  Median : 813017                Median : 618456                 
##  Mean   :1166698                Mean   : 852411                 
##  3rd Qu.:1441058                3rd Qu.:1014627                 
##  Max.   :5578997                Max.   :5287901                 
##  Education_Associate's_degree Education_Bachelor's_degree
##  Min.   :  15235              Min.   :  73255            
##  1st Qu.: 123095              1st Qu.: 243453            
##  Median : 273742              Median : 547876            
##  Mean   : 389473              Mean   : 941835            
##  3rd Qu.: 460085              3rd Qu.:1259927            
##  Max.   :2120275              Max.   :5958030            
##  Education_Graduate_professional degree  U.S. citizen      Not a U.S. citizen
##  Min.   :  42363                        Min.   :  567083   Min.   :  11069   
##  1st Qu.: 158678                        1st Qu.: 1820268   1st Qu.:  66793   
##  Median : 358179                        Median : 4404876   Median : 148529   
##  Mean   : 607922                        Mean   : 6059088   Mean   : 411049   
##  3rd Qu.: 863754                        3rd Qu.: 6893788   3rd Qu.: 423286   
##  Max.   :3779787                        Max.   :34499330   Max.   :4738506   
##    Total_age         Age_under_18      Age_18_to_24      Age_25_to_34    
##  Min.   :  578803   Min.   : 116767   Min.   :  51926   Min.   :  70511  
##  1st Qu.: 1871432   1st Qu.: 442838   1st Qu.: 169060   1st Qu.: 235319  
##  Median : 4377774   Median : 986696   Median : 402442   Median : 589334  
##  Mean   : 6445333   Mean   :1423482   Mean   : 587334   Mean   : 874641  
##  3rd Qu.: 7391910   3rd Qu.:1629658   3rd Qu.: 683308   3rd Qu.:1039599  
##  Max.   :39237836   Max.   :8769779   Max.   :3558188   Max.   :5827869  
##   Age_35_to_44      Age_45_to_54      Age_55_to_64      Age_over_64     
##  Min.   :  78268   Min.   :  65531   Min.   :  66238   Min.   :  85615  
##  1st Qu.: 245703   1st Qu.: 216349   1st Qu.: 238707   1st Qu.: 320627  
##  Median : 584114   Median : 524686   Median : 559544   Median : 767373  
##  Mean   : 848734   Mean   : 790186   Mean   : 831869   Mean   :1089087  
##  3rd Qu.: 961656   3rd Qu.: 888609   3rd Qu.: 958270   3rd Qu.:1272226  
##  Max.   :5414686   Max.   :4928928   Max.   :4773860   Max.   :5964526  
##      Total          Owner Occupied    Renter Occupied    Total_people     
##  Min.   :  242763   Min.   : 132936   Min.   :  69516   Min.   :  570046  
##  1st Qu.:  715121   1st Qu.: 524154   1st Qu.: 192915   1st Qu.: 1848408  
##  Median : 1743262   Median :1143470   Median : 569181   Median : 4317403  
##  Mean   : 2475206   Mean   :1619184   Mean   : 856022   Mean   : 6349048  
##  3rd Qu.: 2868856   3rd Qu.:1912862   3rd Qu.:1032259   3rd Qu.: 7285134  
##  Max.   :13429063   Max.   :7502706   Max.   :5926357   Max.   :38724294  
##  Total With Disabilities    Hearing        Vision difficulty   cognative      
##  Min.   :  76754         Min.   :  14429   Min.   : 12915    Min.   :  27746  
##  1st Qu.: 257303         1st Qu.:  84763   1st Qu.: 48799    1st Qu.: 104446  
##  Median : 660485         Median : 185961   Median :119128    Median : 261010  
##  Mean   : 830722         Mean   : 226790   Mean   :158917    Mean   : 323599  
##  3rd Qu.: 979094         3rd Qu.: 288780   3rd Qu.:193366    3rd Qu.: 374062  
##  Max.   :4324355         Max.   :1140131   Max.   :844049    Max.   :1710305  
##  ambulatory difficulty Self-care difficulty Independent living difficulty
##  Min.   :  33324       Min.   : 10601       Min.   :  23311              
##  1st Qu.: 109640       1st Qu.: 43107       1st Qu.:  77948              
##  Median : 291072       Median :114600       Median : 229318              
##  Mean   : 400503       Mean   :154262       Mean   : 296658              
##  3rd Qu.: 462491       3rd Qu.:171742       3rd Qu.: 346728              
##  Max.   :2094886       Max.   :972688       Max.   :1741491              
##  No Disability      DisabilityRate     Cluster     
##  Min.   :  490741   Min.   :10.35   Min.   :1.000  
##  1st Qu.: 1570278   1st Qu.:12.05   1st Qu.:1.000  
##  Median : 3608154   Median :13.46   Median :2.000  
##  Mean   : 5518325   Mean   :13.75   Mean   :1.712  
##  3rd Qu.: 6306040   3rd Qu.:14.38   3rd Qu.:2.000  
##  Max.   :34399939   Max.   :22.01   Max.   :3.000
# Load necessary library
library(ggplot2)
library(rlang)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
# One distribution plot per column of merged_data: histograms for numeric
# columns, bar plots of category counts for everything else. The .data
# pronoun replaces the !!sym() tidy-eval idiom; the plots are identical.
for (col_name in names(merged_data)) {

  if (is.numeric(merged_data[[col_name]])) {
    # Numeric column: histogram of its distribution.
    p <- ggplot(merged_data, aes(x = .data[[col_name]])) +
      geom_histogram(bins = 30, fill = "blue", color = "black") +
      theme_minimal() +
      labs(title = paste("Histogram of", col_name), x = col_name, y = "Frequency")

  } else {
    # Non-numeric column: bar plot of category counts.
    p <- ggplot(merged_data, aes(x = .data[[col_name]])) +
      geom_bar(fill = "blue", color = "black") +
      theme_minimal() +
      labs(title = paste("Bar Plot of", col_name), x = col_name, y = "Count") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1)) # readable x labels
  }

  # Explicit print() is required inside a loop for ggplot objects to render.
  print(p)
}

  1. Correlation Analysis

    • Calculate correlation matrices for the variables related to employment, education, age, etc., to identify strong correlations.
  2. Statistical Testing

    • Conduct hypothesis testing (e.g., t-tests, ANOVA) to determine if differences in employment rates across different groups (like age groups, educational levels) are statistically significant.
  3. Predictive Modeling

    • Build regression models to predict employment rates based on other variables.

    • Explore other predictive models if applicable (e.g., decision trees, random forests).

  4. Clustering Analysis

    • Use k-means or hierarchical clustering to find patterns or groups in the data based on socio-economic factors.
  5. Data Visualization and Mapping

    • Create comprehensive visualizations and maps to represent the findings. For instance, use ggplot2 and usmap for spatial data representation.
  6. Interpretation and Conclusion

    • Interpret the results from the statistical analyses and models.

    • Draw conclusions about the socio-economic factors affecting employment.

  7. Reporting

    • Compile the results and interpretations into a final report or presentation.